Lab 6

Author

Emily Mitchell

Published

April 4, 2025

library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.1     ✔ tibble    3.2.1
✔ lubridate 1.9.4     ✔ tidyr     1.3.1
✔ purrr     1.0.4     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidymodels)
── Attaching packages ────────────────────────────────────── tidymodels 1.3.0 ──
✔ broom        1.0.7     ✔ rsample      1.2.1
✔ dials        1.4.0     ✔ tune         1.3.0
✔ infer        1.0.7     ✔ workflows    1.2.0
✔ modeldata    1.4.0     ✔ workflowsets 1.1.0
✔ parsnip      1.3.0     ✔ yardstick    1.3.2
✔ recipes      1.1.1     
── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
✖ scales::discard() masks purrr::discard()
✖ dplyr::filter()   masks stats::filter()
✖ recipes::fixed()  masks stringr::fixed()
✖ dplyr::lag()      masks stats::lag()
✖ yardstick::spec() masks readr::spec()
✖ recipes::step()   masks stats::step()
library(powerjoin)
library(glue)
library(vip)

Attaching package: 'vip'

The following object is masked from 'package:utils':

    vi
library(baguette)
root  <- 'https://gdex.ucar.edu/dataset/camels/file'
download.file('https://gdex.ucar.edu/dataset/camels/file/camels_attributes_v2.0.pdf', 
              'data/camels_attributes_v2.0.pdf')
types <- c("clim", "geol", "soil", "topo", "vege", "hydro")
# Where the files live online ...
remote_files  <- glue('{root}/camels_{types}.txt')
# where we want to download the data ...
local_files   <- glue('data/camels_{types}.txt')
walk2(remote_files, local_files, download.file, quiet = TRUE)
camels <- map(local_files, read_delim, show_col_types = FALSE)
camels <- power_full_join(camels ,by = 'gauge_id')

Question 1

zero_q_freq represents the frequency of days with Q = 0 mm/day as a %.

ggplot(data = camels, aes(x = gauge_lon, y = gauge_lat)) +
  borders("state", colour = "gray50") +
  geom_point(aes(color = q_mean)) +
  scale_color_gradient(low = "pink", high = "dodgerblue") +
  ggthemes::theme_map()

Question 2

p1 <- ggplot(data = camels, aes(x = gauge_lon, y = gauge_lat)) +
  borders("state", colour = "gray50") +
  geom_point(aes(color = aridity)) +
  scale_color_gradient(low = "pink", high = "dodgerblue") +
  labs(title = "Aridity of Sites") +
  ggthemes::theme_map()

p2 <- ggplot(data = camels, aes(x = gauge_lon, y = gauge_lat)) +
  borders("state", colour = "gray50") +
  geom_point(aes(color = p_mean)) +
  scale_color_gradient(low = "pink", high = "dodgerblue") +
  ggthemes::theme_map() + 
  labs(title = "P Mean of Sites")
library(patchwork)
print(p1 | p2)

camels |> 
  select(aridity, p_mean, q_mean) |> 
  drop_na() |> 
  cor()
           aridity     p_mean     q_mean
aridity  1.0000000 -0.7550090 -0.5817771
p_mean  -0.7550090  1.0000000  0.8865757
q_mean  -0.5817771  0.8865757  1.0000000
# Create a scatter plot of aridity vs rainfall
ggplot(camels, aes(x = aridity, y = p_mean)) +
  # Add points colored by mean flow
  geom_point(aes(color = q_mean)) +
  # Add a linear regression line
  geom_smooth(method = "lm", color = "red", linetype = 2) +
  # Apply the viridis color scale
  scale_color_viridis_c() +
  # Add a title, axis labels, and theme (w/ legend on the bottom)
  theme_linedraw() + 
  theme(legend.position = "bottom") + 
  labs(title = "Aridity vs Rainfall vs Runnoff", 
       x = "Aridity", 
       y = "Rainfall",
       color = "Mean Flow")
`geom_smooth()` using formula = 'y ~ x'

ggplot(camels, aes(x = aridity, y = p_mean)) +
  geom_point(aes(color = q_mean)) +
  geom_smooth(method = "lm") +
  scale_color_viridis_c() +
  # Apply log transformations to the x and y axes
  scale_x_log10() + 
  scale_y_log10() +
  theme_linedraw() +
  theme(legend.position = "bottom") + 
  labs(title = "Aridity vs Rainfall vs Runnoff", 
       x = "Aridity", 
       y = "Rainfall",
       color = "Mean Flow")
`geom_smooth()` using formula = 'y ~ x'

ggplot(camels, aes(x = aridity, y = p_mean)) +
  geom_point(aes(color = q_mean)) +
  geom_smooth(method = "lm") +
  # Apply a log transformation to the color scale
  scale_color_viridis_c(trans = "log") +
  scale_x_log10() + 
  scale_y_log10() +
  theme_linedraw() +
  theme(legend.position = "bottom",
        # Expand the legend width ...
        legend.key.width = unit(2.5, "cm"),
        legend.key.height = unit(.5, "cm")) + 
  labs(title = "Aridity vs Rainfall vs Runnoff", 
       x = "Aridity", 
       y = "Rainfall",
       color = "Mean Flow") 
`geom_smooth()` using formula = 'y ~ x'

set.seed(123)

camels <- camels |> 
  mutate(logQmean = log(q_mean))


camels_split <- initial_split(camels, prop = 0.8)
camels_train <- training(camels_split)
camels_test  <- testing(camels_split)

camels_cv <- vfold_cv(camels_train, v = 10)
rec <-  recipe(logQmean ~ aridity + p_mean, data = camels_train) %>%
  step_log(all_predictors()) %>%
  step_interact(terms = ~ aridity:p_mean) |> 
  step_naomit(all_predictors(), all_outcomes())
baked_data <- prep(rec, camels_train) |> 
  bake(new_data = NULL)

lm_base <- lm(logQmean ~ aridity * p_mean, data = baked_data)
summary(lm_base)

Call:
lm(formula = logQmean ~ aridity * p_mean, data = baked_data)

Residuals:
     Min       1Q   Median       3Q      Max 
-2.91162 -0.21601 -0.00716  0.21230  2.85706 

Coefficients:
               Estimate Std. Error t value Pr(>|t|)    
(Intercept)    -1.77586    0.16365 -10.852  < 2e-16 ***
aridity        -0.88397    0.16145  -5.475 6.75e-08 ***
p_mean          1.48438    0.15511   9.570  < 2e-16 ***
aridity:p_mean  0.10484    0.07198   1.457    0.146    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.5696 on 531 degrees of freedom
Multiple R-squared:  0.7697,    Adjusted R-squared:  0.7684 
F-statistic: 591.6 on 3 and 531 DF,  p-value: < 2.2e-16
summary(lm(logQmean ~ aridity + p_mean + aridity_x_p_mean, data = baked_data))

Call:
lm(formula = logQmean ~ aridity + p_mean + aridity_x_p_mean, 
    data = baked_data)

Residuals:
     Min       1Q   Median       3Q      Max 
-2.91162 -0.21601 -0.00716  0.21230  2.85706 

Coefficients:
                 Estimate Std. Error t value Pr(>|t|)    
(Intercept)      -1.77586    0.16365 -10.852  < 2e-16 ***
aridity          -0.88397    0.16145  -5.475 6.75e-08 ***
p_mean            1.48438    0.15511   9.570  < 2e-16 ***
aridity_x_p_mean  0.10484    0.07198   1.457    0.146    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.5696 on 531 degrees of freedom
Multiple R-squared:  0.7697,    Adjusted R-squared:  0.7684 
F-statistic: 591.6 on 3 and 531 DF,  p-value: < 2.2e-16
test_data <-  bake(prep(rec), new_data = camels_test)
test_data$lm_pred <- predict(lm_base, newdata = test_data)
metrics(test_data, truth = logQmean, estimate = lm_pred)
# A tibble: 3 × 3
  .metric .estimator .estimate
  <chr>   <chr>          <dbl>
1 rmse    standard       0.583
2 rsq     standard       0.742
3 mae     standard       0.390
ggplot(test_data, aes(x = logQmean, y = lm_pred, colour = aridity)) +
  # Apply a gradient color scale
  scale_color_gradient2(low = "brown", mid = "orange", high = "darkgreen") +
  geom_point() +
  geom_abline(linetype = 2) +
  theme_linedraw() + 
  labs(title = "Linear Model: Observed vs Predicted",
       x = "Observed Log Mean Flow",
       y = "Predicted Log Mean Flow",
       color = "Aridity")

# Define model
lm_model <- linear_reg() %>%
  # define the engine
  set_engine("lm") %>%
  # define the mode
  set_mode("regression")

# Instantiate a workflow ...
lm_wf <- workflow() %>%
  # Add the recipe
  add_recipe(rec) %>%
  # Add the model
  add_model(lm_model) %>%
  # Fit the model to the training data
  fit(data = camels_train) 

# Extract the model coefficients from the workflow
summary(extract_fit_engine(lm_wf))$coefficients
                   Estimate Std. Error    t value     Pr(>|t|)
(Intercept)      -1.7758557 0.16364755 -10.851710 6.463654e-25
aridity          -0.8839738 0.16144589  -5.475357 6.745512e-08
p_mean            1.4843771 0.15511117   9.569762 4.022500e-20
aridity_x_p_mean  0.1048449 0.07198145   1.456555 1.458304e-01
summary(lm_base)$coefficients
                 Estimate Std. Error    t value     Pr(>|t|)
(Intercept)    -1.7758557 0.16364755 -10.851710 6.463654e-25
aridity        -0.8839738 0.16144589  -5.475357 6.745512e-08
p_mean          1.4843771 0.15511117   9.569762 4.022500e-20
aridity:p_mean  0.1048449 0.07198145   1.456555 1.458304e-01
lm_data <- augment(lm_wf, new_data = camels_test)
dim(lm_data)
[1] 135  61
metrics(lm_data, truth = logQmean, estimate = .pred)
# A tibble: 3 × 3
  .metric .estimator .estimate
  <chr>   <chr>          <dbl>
1 rmse    standard       0.583
2 rsq     standard       0.742
3 mae     standard       0.390
ggplot(lm_data, aes(x = logQmean, y = .pred, colour = aridity)) +
  scale_color_viridis_c() +
  geom_point() +
  geom_abline() +
  theme_linedraw()

library(baguette)
library(ranger)
rf_model <- rand_forest() %>%
  set_engine("ranger", importance = "impurity") %>%
  set_mode("regression")

rf_wf <- workflow() %>%
  # Add the recipe
  add_recipe(rec) %>%
  # Add the model
  add_model(rf_model) %>%
  # Fit the model
  fit(data = camels_train) 
rf_data <- augment(rf_wf, new_data = camels_test)
dim(rf_data)
[1] 135  60
metrics(rf_data, truth = logQmean, estimate = .pred)
# A tibble: 3 × 3
  .metric .estimator .estimate
  <chr>   <chr>          <dbl>
1 rmse    standard       0.587
2 rsq     standard       0.741
3 mae     standard       0.363
ggplot(rf_data, aes(x = logQmean, y = .pred, colour = aridity)) +
  scale_color_viridis_c() +
  geom_point() +
  geom_abline() +
  theme_linedraw()

wf <- workflow_set(list(rec), list(lm_model, rf_model)) %>%
  workflow_map('fit_resamples', resamples = camels_cv) 

autoplot(wf)

rank_results(wf, rank_metric = "rsq", select_best = TRUE)
# A tibble: 4 × 9
  wflow_id          .config .metric  mean std_err     n preprocessor model  rank
  <chr>             <chr>   <chr>   <dbl>   <dbl> <int> <chr>        <chr> <int>
1 recipe_rand_fore… Prepro… rmse    0.563  0.0247    10 recipe       rand…     1
2 recipe_rand_fore… Prepro… rsq     0.771  0.0259    10 recipe       rand…     1
3 recipe_linear_reg Prepro… rmse    0.569  0.0260    10 recipe       line…     2
4 recipe_linear_reg Prepro… rsq     0.770  0.0223    10 recipe       line…     2

Question 3

xgb_model <- boost_tree() %>%
  set_engine("xgboost") %>%
  set_mode("regression")

nn_model <- bag_mlp() %>%
  set_engine("nnet") %>%
  set_mode("regression")
library(xgboost)

Attaching package: 'xgboost'
The following object is masked from 'package:dplyr':

    slice
wf <- workflow_set(list(rec), list(lm_model, rf_model, xgb_model, nn_model)) %>%
  workflow_map('fit_resamples', resamples = camels_cv) 

autoplot(wf)

rank_results(wf, rank_metric = "rsq", select_best = TRUE)
# A tibble: 8 × 9
  wflow_id          .config .metric  mean std_err     n preprocessor model  rank
  <chr>             <chr>   <chr>   <dbl>   <dbl> <int> <chr>        <chr> <int>
1 recipe_bag_mlp    Prepro… rmse    0.547  0.0308    10 recipe       bag_…     1
2 recipe_bag_mlp    Prepro… rsq     0.787  0.0266    10 recipe       bag_…     1
3 recipe_rand_fore… Prepro… rmse    0.565  0.0247    10 recipe       rand…     2
4 recipe_rand_fore… Prepro… rsq     0.770  0.0258    10 recipe       rand…     2
5 recipe_linear_reg Prepro… rmse    0.569  0.0260    10 recipe       line…     3
6 recipe_linear_reg Prepro… rsq     0.770  0.0223    10 recipe       line…     3
7 recipe_boost_tree Prepro… rmse    0.600  0.0289    10 recipe       boos…     4
8 recipe_boost_tree Prepro… rsq     0.745  0.0268    10 recipe       boos…     4

I would move forward with the neural network model because it has the highest r-squared value.

Question 4

Data Splitting

set.seed(123)

camels1 <- camels |> 
  mutate(logQmean = log(q_mean))

camels_split <- initial_split(camels1, prop = 0.75)
camels_train <- training(camels_split)
camels_test  <- testing(camels_split)

camels_cv <- vfold_cv(camels_train, v = 10)

Recipe

rec1 <-  recipe(logQmean ~ aridity + p_mean + soil_porosity, data = camels_train) %>%
  step_log(all_predictors()) %>%
  step_interact(terms = ~ aridity:p_mean) %>%  
  step_naomit(all_predictors(), all_outcomes())


baked_data <- prep(rec1, camels_train) |> 
  bake(new_data = NULL)

lm_base1 <- lm(logQmean ~ ., data = baked_data)
summary(lm_base1)

Call:
lm(formula = logQmean ~ ., data = baked_data)

Residuals:
     Min       1Q   Median       3Q      Max 
-2.93279 -0.21727  0.00367  0.21075  2.82522 

Coefficients:
                 Estimate Std. Error t value Pr(>|t|)    
(Intercept)      -2.49567    0.41223  -6.054 2.79e-09 ***
aridity          -0.94477    0.17282  -5.467 7.26e-08 ***
p_mean            1.41806    0.16511   8.588  < 2e-16 ***
soil_porosity    -0.95656    0.50564  -1.892   0.0591 .  
aridity_x_p_mean  0.09854    0.07584   1.299   0.1945    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.5729 on 497 degrees of freedom
Multiple R-squared:  0.771, Adjusted R-squared:  0.7691 
F-statistic: 418.3 on 4 and 497 DF,  p-value: < 2.2e-16

I am adding soil porosity as a predictor in my model because I think it does have an impact on daily discharge.

Define 3 Models

rf_mod <- rand_forest() %>% 
  set_engine('ranger') %>% 
  set_mode("regression")

b_mod <- boost_tree() %>% 
  set_engine('xgboost') %>% 
  set_mode("regression")

nn_mod <- mlp(hidden = 10) %>% 
  set_engine('nnet') %>% 
  set_mode("regression")

Workflow set

wf1 <- workflow_set(list(rec1), list(
                                   rf_mod,
                                   b_mod,
                                   nn_mod)) %>% 
  workflow_map(resamples = camels_cv)

Evaluation I think the random forest model is the best because it has the highest r-squared and lowest root mean squared error.

autoplot(wf1) 

ranked_results <- rank_results(wf1, rank_metric = "rsq",
               select_best = TRUE)
print(ranked_results)
# A tibble: 6 × 9
  wflow_id          .config .metric  mean std_err     n preprocessor model  rank
  <chr>             <chr>   <chr>   <dbl>   <dbl> <int> <chr>        <chr> <int>
1 recipe_rand_fore… Prepro… rmse    0.509  0.0270    10 recipe       rand…     1
2 recipe_rand_fore… Prepro… rsq     0.819  0.0185    10 recipe       rand…     1
3 recipe_boost_tree Prepro… rmse    0.542  0.0300    10 recipe       boos…     2
4 recipe_boost_tree Prepro… rsq     0.801  0.0225    10 recipe       boos…     2
5 recipe_mlp        Prepro… rmse    0.548  0.0321    10 recipe       mlp       3
6 recipe_mlp        Prepro… rsq     0.790  0.0257    10 recipe       mlp       3

Extract and Evaluate

rf_wf1 <- workflow() %>%
  add_recipe(rec1) %>%
  add_model(rf_mod) %>%
  fit(data = camels_train) 


rf_data <- augment(rf_wf1, new_data = camels_test)
dim(rf_data)
[1] 168  60
ggplot(rf_data, aes(x = logQmean, y = .pred, colour = aridity)) +
  scale_color_gradient(low = "pink", high = "purple") +
  geom_point() +
  geom_abline() +
  theme_linedraw()

Overall, I think this model does a pretty good job of predicting mean daily discharge. The scatter of points around the line indicate that there are some prediction errors, but the general trend highlights the fact that the model is capturing the underlying relationship.